{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# 安装依赖"
      ],
      "metadata": {
        "id": "uVU6uoaoLsqd"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true,
        "id": "4L9ogBJb8mS6",
        "jupyter": {
          "outputs_hidden": true
        }
      },
      "outputs": [],
      "source": [
        "import locale\n",
        "def getpreferredencoding(do_setlocale = True):\n",
        "    return \"UTF-8\"\n",
        "locale.getpreferredencoding = getpreferredencoding\n",
        "!pip install pysrt\n",
        "from tqdm import tqdm\n",
        "import pysrt\n",
        "google_drive=False\n",
        "import os"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4mLQai2UV7s4"
      },
      "source": [
        "# 选择翻译方式"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true,
        "id": "_Roehqbxe751",
        "jupyter": {
          "outputs_hidden": true
        },
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title 翻译模型翻译方式，同时加载模型，查看可用语言（最后一步的源语言和目标语言只能从可用语言里面选，每个模型可用语言各不相同）\n",
        "!pip install dl_translate\n",
        "import dl_translate as dlt\n",
        "\n",
        "翻译模型 = \"facebook/nllb-200-distilled-600M\" # @param [\"facebook/m2m100_418M\",\"facebook/m2m100_1.2B\",\"facebook/mbart-large-50-many-to-many-mmt\", \"facebook/nllb-200-distilled-600M\"]\n",
        "\n",
        "mt = dlt.TranslationModel(翻译模型,device=\"auto\")\n",
        "print(\"可用语言：\")\n",
        "print(\"\\n\".join(mt.available_languages()))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "collapsed": true,
        "id": "XmHk-1qefFWD",
        "jupyter": {
          "outputs_hidden": true
        }
      },
      "outputs": [],
      "source": [
        "#@title 谷歌翻译方式（免费调用，不稳定，最后一步的源语言和目标语言只能从可用语言里面选）\n",
        "!pip install googletrans==3.1.0a0\n",
        "import googletrans\n",
        "translator = googletrans.Translator()\n",
        "print(\"可用语言：\")\n",
        "print(\"\\n\".join(googletrans.LANGUAGES))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "IZKUencSOo0R",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title ChatGLM3翻译方式（系统提示词中\"{}\"将被替换为目标语言）\n",
        "系统提示词 = \"\\u4F60\\u662F\\u4E00\\u4E2A\\u8D1F\\u8D23\\u7FFB\\u8BD1\\u5B57\\u5E55\\u7684\\u7A0B\\u5E8F\\u3002\\u60A8\\u7684\\u4EFB\\u52A1\\u662F\\u5C06\\u8F93\\u5165\\u7684\\u5B57\\u5E55\\u7684\\u539F\\u59CB\\u8BED\\u8A00\\u7FFB\\u8BD1\\u4E3A{}\\u3002DO NOT\\u8F93\\u51FA\\u9664\\u7FFB\\u8BD1\\u4EE5\\u5916\\u7684\\u4EFB\\u4F55\\u6587\\u672C\\u3002\" #@param {type: \"string\"}\n",
        "!pip install transformers sentencepiece\n",
        "from transformers import AutoTokenizer, AutoModel\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"THUDM/chatglm3-6b\", trust_remote_code=True)\n",
        "model = AutoModel.from_pretrained(\"THUDM/chatglm3-6b\", trust_remote_code=True, device='cuda')\n",
        "model = model.eval()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 连接到谷歌云盘\n",
        "如果文件要保存到谷歌云盘，翻译步骤的“保存到”或“恢复文件”应以`drive/MyDrive/`开头"
      ],
      "metadata": {
        "id": "Zd1JbOwXGErL"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true,
        "id": "Nv8Q47mRv-t7",
        "jupyter": {
          "outputs_hidden": true
        }
      },
      "outputs": [],
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zUbi83308mS9"
      },
      "source": [
        "# 翻译\n",
        "提示：如未指定恢复文件，则程序将不会保存恢复文件"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DFYpgf6fCtfb"
      },
      "outputs": [],
      "source": [
        "源文件 = '\\u5F85\\u7FFB\\u8BD1.srt'  #@param {type: \"string\"}\n",
        "保存到 = 'drive/MyDrive/\\u7FFB\\u8BD1.srt'  #@param {type: \"string\"}\n",
        "源语言 = 'en'  #@param {type: \"string\"}\n",
        "目标语言 = 'zh-cn'  #@param {type: \"string\"}\n",
        "翻译方式 = \"\\u8C37\\u6B4C\\u7FFB\\u8BD1\" #@param [\"谷歌翻译\", \"大模型翻译\",\"ChatGLM3翻译\",\"不翻译\"]\n",
        "恢复文件 = 'drive/MyDrive/\\u6062\\u590D\\u6587\\u4EF6.srt'  #@param {type: \"string\"}\n",
        "恢复间隔 = 20  #@param {type: \"integer\"}\n",
        "def translate(sentence):\n",
        "    if 翻译方式==\"大模型翻译\":\n",
        "        return mt.translate(sentence, source=源语言, target=目标语言)\n",
        "    elif 翻译方式==\"谷歌翻译\":\n",
        "        return translator.translate(sentence, src=源语言, dest=目标语言).text\n",
        "    elif 翻译方式==\"ChatGLM3翻译\":\n",
        "        response,now_history=model.chat(tokenizer,sentence,top_p=1,temperature=0.01,history=[{\"role\": \"system\", \"content\": 系统提示词.format(目标语言)}])\n",
        "        return response\n",
        "    elif 翻译方式==\"不翻译\":\n",
        "        return sentence\n",
        "def merge_subtitles(subtitles):\n",
        "    merged_subtitles=pysrt.SubRipFile()\n",
        "    current_sentence = \"\"\n",
        "    current_start = None\n",
        "    current_end = None\n",
        "    for i in range(len(subtitles)):\n",
        "        now_sub=subtitles[i]\n",
        "        if i<len(subtitles)-1:\n",
        "            next_sub=subtitles[i+1]\n",
        "        #分割条件：当前句子结尾有“.”，下一个句子start与当前句子end不相等，已经是最后一个句子\n",
        "        #若符合条件，则对当前句子进行分割\n",
        "        if current_start is None:\n",
        "            current_start=now_sub.start\n",
        "        current_end=now_sub.end\n",
        "        current_sentence+=now_sub.text+\" \"\n",
        "        if i==len(subtitles)-1 or now_sub.text[-1]==\".\" or next_sub.start!=now_sub.end:\n",
        "            #创建新的字幕项并添加到结果\n",
        "            sub = pysrt.SubRipItem(index=len(merged_subtitles) + 1,\n",
        "                                             start=current_start, end=current_end,\n",
        "                                             text=current_sentence.strip())\n",
        "            merged_subtitles.append(sub)\n",
        "            #重置参数\n",
        "            current_sentence = \"\"\n",
        "            current_start = None\n",
        "            current_end = None\n",
        "    return merged_subtitles\n",
        "\n",
        "def translate_with_progress(subtitles):\n",
        "    print(\"对字幕进行合并中...\")\n",
        "    merged_subtitles=merge_subtitles(subtitles)\n",
        "    print(\"字幕合并完成，开始翻译\")\n",
        "    # 是否进行恢复\n",
        "    if 恢复文件:\n",
        "        #如果输入了恢复文件\n",
        "        if not os.path.exists(恢复文件):\n",
        "            #如果恢复文件不存在，则创建它\n",
        "            print(\"恢复文件不存在，已创建新的恢复文件\")\n",
        "            with open(恢复文件, \"w\") as f:pass\n",
        "            recover_index = 0\n",
        "            translated_subs = pysrt.SubRipFile()\n",
        "        else:\n",
        "            # 读取恢复文件\n",
        "            recovered_subs = pysrt.open(恢复文件, encoding='utf-8')\n",
        "            #有恢复文件，从恢复文件结尾开始继续\n",
        "            recover_index=len(recovered_subs)\n",
        "            translated_subs = recovered_subs\n",
        "            print(\"找到恢复文件，从第{}句继续翻译\".format(recover_index))\n",
        "    else:\n",
        "        recover_index = 0\n",
        "        translated_subs = pysrt.SubRipFile()\n",
        "\n",
        "    # 使用 tqdm 显示进度条\n",
        "    progress_bar = tqdm(total=len(merged_subtitles), desc=\"翻译中\", unit=\"个\")\n",
        "    progress_bar.update(recover_index)\n",
        "    for i in range(recover_index, len(merged_subtitles)):\n",
        "        translated = pysrt.SubRipItem(index=len(translated_subs) + 1,\n",
        "                                             start=merged_subtitles[i].start, end=merged_subtitles[i].end,\n",
        "                                             text=translate(merged_subtitles[i].text).strip())\n",
        "        translated_subs.append(translated)\n",
        "        progress_bar.update(1)\n",
        "        if 恢复文件 and i%恢复间隔==0:\n",
        "            translated_subs.save(恢复文件,encoding='utf-8')\n",
        "    # 关闭进度条\n",
        "    progress_bar.close()\n",
        "    return translated_subs\n",
        "\n",
        "# 读取SRT文件\n",
        "subs = pysrt.open(源文件, encoding='utf-8')\n",
        "\n",
        "# 获取翻译后的字幕\n",
        "translated_subs = translate_with_progress(subs)\n",
        "print(\"\\n-------------------------\\n翻译完毕！\")\n",
        "# 保存翻译后的SRT文件\n",
        "translated_subs.save(保存到, encoding='utf-8')\n",
        "print(\"文件保存到了：{}\".format(保存到))"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "uVU6uoaoLsqd",
        "4mLQai2UV7s4",
        "Zd1JbOwXGErL"
      ]
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}